{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3560a64c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/malaysian-benchmark/resolve/main/iium-coffession-en.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "14d402ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import requests\n",
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "4c88bdf5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1000"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('iium-coffession-en.json') as fopen:\n",
    "    d = json.load(fopen)\n",
    "    \n",
    "len(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4795de0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir small-iium-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "94ead50b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, T5ForConditionalGeneration\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-small-malaysian-translation-v2')\n",
    "model = T5ForConditionalGeneration.from_pretrained('mesolitica/nanot5-small-malaysian-translation-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dc7541c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_special_ids = [0, 1, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "dbbabbb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "l = d[0]['from']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e46396c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The problem of acne will worsen if using alkaline/high pH facial cleansers, and when using low pH facial cleansers, acne will decrease and the skin will improve (If you are diligent, you can read the study Korting et al, the title 'The Influence of the Regular Use of a Soap or an Acidic Syndet Bar on Pre-Acne', not sure if you can find it on Google, but in the blog I read it, there is a direct link to the pdf file study).\n"
     ]
    }
   ],
   "source": [
    "input_ids = tokenizer.encode(f'terjemah ke Inggeris: {l}{tokenizer.eos_token}', return_tensors = 'pt')\n",
    "outputs = model.generate(input_ids, max_length = 1024, num_beams=5, early_stopping=True)\n",
    "outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "print(tokenizer.decode(outputs, spaces_between_special_tokens = False).strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4ed56e3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [08:11<00:00,  2.04it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(len(d))):\n",
    "    filename = os.path.join('small-iium-en', f'{i}.json')\n",
    "    \n",
    "    if os.path.exists(filename):\n",
    "        continue\n",
    "    \n",
    "    headers = {\n",
    "        'accept': 'application/json',\n",
    "        'Content-Type': 'application/json',\n",
    "    }\n",
    "    \n",
    "    l = d[i]['from']\n",
    "    \n",
    "    input_ids = tokenizer.encode(f'terjemah ke Inggeris: {l}{tokenizer.eos_token}', return_tensors = 'pt')\n",
    "    outputs = model.generate(input_ids, max_length = 1024, num_beams=5, early_stopping=True)\n",
    "    outputs = [i for i in outputs[0] if i not in all_special_ids]\n",
    "    r = tokenizer.decode(outputs, spaces_between_special_tokens = False).strip()\n",
    "\n",
    "    with open(filename, 'w') as fopen:\n",
    "        json.dump({'text': l, 'r': r}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e2f99f63",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sacrebleu.metrics import BLEU, CHRF, TER\n",
    "\n",
    "chrf = CHRF(word_order = 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "7e04b945",
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted, actual = [], []\n",
    "for i in range(len(d)):\n",
    "    try:\n",
    "        filename = os.path.join('small-iium-en', f'{i}.json')\n",
    "        with open(filename) as fopen:\n",
    "            d_ = json.load(fopen)\n",
    "        predicted.append(d_['r'])\n",
    "        actual.append(d[i]['to'])\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "45bafef8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "chrF2++ = 55.24"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score = chrf.corpus_score(predicted, [actual])\n",
    "score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a9a91a5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
